Dependencies

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.1.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(readr)
library(viridis)
## Loading required package: viridisLite
library(e1071)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift

Data

16x

Note: there are only few individuals of the small ciliates because only cells bigger than 1000um2 are tracked at this magnification and most small ciliates (Tetrahymena, Dexiostoma and Loxocephalus) are smaller than that.

load("../data/16x/Coleps_irchel/Morph_mvt.RData")
# temp2 <- morph_mvt
# temp2$species <- "Coleps_irchel2"
dd16 <- morph_mvt

load("../data/16x/Coleps_viridis/Morph_mvt.RData")
# dd16 <- rbind(dd16, morph_mvt)

load("../data/16x/Colpidium/Morph_mvt.RData")
morph_mvt$species <- "Colpidium"
dd16 <- rbind(dd16, morph_mvt)

# load("../data/16x/Didinium/Morph_mvt.RData")
# dd16 <- rbind(dd16, morph_mvt)

load("../data/16x/Euplotes/Morph_mvt.RData")
# morph_mvt <- morph_mvt %>% filter(mean_area>3000)
dd16 <- rbind(dd16, morph_mvt)

load("../data/16x/Euplotes_second/Morph_mvt.RData")
# morph_mvt <- morph_mvt %>% filter(mean_area>3000)
dd16 <- rbind(dd16, morph_mvt)

load("../data/16x/Paramecium_bursaria/Morph_mvt.RData")
dd16 <- rbind(dd16, morph_mvt)

load("../data/16x/Paramecium_caudatum/Morph_mvt.RData")
dd16 <- rbind(dd16, morph_mvt)

load("../data/16x/Stylonychia_1/Morph_mvt.RData")
morph_mvt$species <- "Stylonychia1"
# temp <- morph_mvt
# temp$species <- "Stylonychia1_1"
dd16 <- rbind(dd16, morph_mvt)

load("../data/16x/Stylonychia_1_second/Morph_mvt.RData")
morph_mvt$species <- "Stylonychia1"
# temp2 <- morph_mvt
# temp2$species <- "Stylonychia1_2"
dd16 <- rbind(dd16, morph_mvt)

load("../data/16x/Stylonychia_2/Morph_mvt.RData")
morph_mvt$species <- "Stylonychia2"
# temp3 <- morph_mvt
dd16 <- rbind(dd16, morph_mvt)

load("../data/16x/Dexiostoma/Morph_mvt.RData")
dd16 <- rbind(dd16, morph_mvt)

load("../data/16x/Loxocephallus/Morph_mvt.RData")
dd16 <- rbind(dd16, morph_mvt)

load("../data/16x/Tetrahymena/Morph_mvt.RData")
morph_mvt$species <- "Tetrahymena"
dd16 <- rbind(dd16, morph_mvt)


## filtering

dd16 <- dd16 %>%
  filter(net_disp>50, net_speed>5)

dd16$magnification <- 1.6

table(dd16$species)
## 
##       Coleps_irchel           Colpidium          Dexiostoma            Euplotes 
##                1298                 341                  10                2028 
##       Loxocephallus Paramecium_bursaria Paramecium_caudatum        Stylonychia1 
##                   9                2652                2231                 650 
##        Stylonychia2         Tetrahymena 
##                 375                  25
dd16$species <- ifelse(dd16$species %in% c("Tetrahymena","Dexiostoma","Loxocephallus"),"Smaller_ciliates",dd16$species)

filtering out outliers

dd16$id <- 1:nrow(dd16)
dd16 <- dd16 %>% na.omit()
dd16_long <- dd16 %>%
  dplyr::select(species, mean_area,sd_area,mean_perimeter,sd_perimeter,mean_major,
        sd_major,mean_ar,sd_ar,
        mean_turning,sd_turning,gross_disp,max_net,net_disp,net_speed,max_step,
        min_step,sd_step,sd_gross_speed,max_gross_speed,min_gross_speed, id) %>%
  pivot_longer(cols=-c(id,species), names_to="variable") %>%
  dplyr::group_by(variable,species) %>%
  mutate(iqr = IQR(value),
         first_quartile = quantile(value, probs = 0.25),
         third_quartile = quantile(value, probs = 0.75),
         outlier = ifelse(value<first_quartile-1.5*iqr | value>third_quartile+1.5*iqr,T,F))



outliers <- dd16_long %>%
  dplyr::filter(outlier==T) %>%
  dplyr::group_by(species, id) %>%
  dplyr::summarise(n = n()) %>%
  dplyr::filter(n>3)
## `summarise()` has grouped output by 'species'. You can override using the `.groups` argument.
table(outliers$species)
## 
##       Coleps_irchel           Colpidium            Euplotes Paramecium_bursaria 
##                 163                  12                  78                 237 
## Paramecium_caudatum    Smaller_ciliates        Stylonychia1        Stylonychia2 
##                  78                   8                  81                  24
nrow(outliers)/nrow(dd16)
## [1] 0.07079738
dd16_filtered <- dd16 %>%
  dplyr::filter(!is.element(id,outliers$id))

dd16$removed_outliers <- F
dd16_filtered$removed_outliers <- T

dd16_comparison <- rbind(dd16,dd16_filtered)


dd16_comparison %>% 
  ggplot(aes((mean_area), col=removed_outliers))+
  geom_histogram(aes(fill=removed_outliers, y=..density..), position = "identity", alpha=0.3) +
  # geom_boxplot(outlier.alpha = 0.3) +
  theme_bw() +
  facet_wrap(~species, scales = "free_y") 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

dd16_filtered %>%
  ggplot(aes(log10(mean_area)))+
  geom_density(aes(col=species))

Species compositions

species <- unique(dd16$species)
compositions <- read_csv("../../../compositions.csv")
## Rows: 15 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): composition
## dbl (19): richness, Chlamydomonas, Cryptomonas, Monoraphidium, Cosmarium, St...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
comp_name <- compositions$composition
compositions$Smaller_ciliates <- with(compositions, 
                                      ifelse(Tetrahymena == 1| Dexiostoma == 1| Loxocephallus == 1, 1, 0))

compositions <- compositions[,species]

compositions_list <- apply(compositions, 1, function(x){
  idx <- which(x==1)
  names(idx)
})

names(compositions_list) <- comp_name

Classifiers

classifier_plot_svm <- function(table, combination.nr){
  # cm <- classifier$confusion
  cm <- table
  ncol <- ncol(cm)
  cm <- apply(cm, 1, function(x) x/sum(x))
  
  cm_long <- data.frame(Predicted = factor(rep(rownames(cm),ncol), levels = rownames(cm)),
                        Truth = factor(rep(colnames(cm), each=ncol), levels = colnames(cm)),
                        Classification = as.vector(cm))
  
  plot <- cm_long %>%
    ggplot(aes(Predicted,Truth,fill=Classification))+
    geom_tile() +
    geom_text(aes(label = round(Classification, 3)), col="white") +
    scale_x_discrete(position = "top") +
    scale_y_discrete(limits = rev(levels(cm_long$Truth))) +
    scale_fill_viridis(option = "D", end=.8, discrete=FALSE)+
    theme(axis.text.x = element_text(angle = 90, hjust = 0))+
    theme(legend.text = element_text(size=14),
          axis.title = element_text(size=14),
          title = element_text(size=16),
          axis.text = element_text(size=13))+
    labs(title=paste("PPV of",combination.nr),fill="PPV")
  
  return(plot)
}


classifier_assessment_plot <- function(cf, combination.nr){

  cf.df <- cf$byClass[,1:4] %>%
    as.data.frame() %>%
    mutate(Species = gsub("Class: ", "", rownames(cf$byClass[,1:5]))) %>%
    rename("NPV" = "Neg Pred Value", "PPV" = "Pos Pred Value",
           "Sens" = "Sensitivity", "Spec" = "Specificity") %>%
    pivot_longer(cols = 1:4) %>%
    mutate(col = ifelse(value>=0.9,"1",
                        ifelse(value<0.9 & value>=0.8,"2",
                               ifelse(value<0.8 & value>=0.7,"3","4"))))

  plot <- cf.df %>%
    ggplot(aes(name,Species,fill=col))+
    geom_tile() +
    geom_text(aes(label = round(value, 3)), col="white") +
    scale_x_discrete(position = "top") +
    scale_y_discrete(limits = rev(levels(as.factor(cf.df$Species)))) +
    scale_fill_manual(values = c("#006837","#66bd63","#f46d43","#a50026"), breaks = c("1","2","3","4")) +
    theme(legend.text = element_text(size=14),
          title = element_text(size = 16),
          axis.title = element_blank(),
          axis.text = element_text(size=13),
          legend.position = "none")+
    labs(title=combination.nr, fill="")
  
  return(plot)
}
formula <- factor(species) ~ mean_area + sd_area + mean_perimeter + sd_perimeter + mean_major +
        sd_major +  mean_ar + sd_ar +
        mean_turning + sd_turning + gross_disp + max_net + net_disp + net_speed + max_step +
        min_step + sd_step + sd_gross_speed + max_gross_speed + min_gross_speed

trainingdata <- dd16[complete.cases(dd16), ]

set.seed(624)

trainingdata$species <- factor(trainingdata$species)
split_up <- split(trainingdata, f = trainingdata$species)
subsamples <- lapply(split_up, function(x) {
  x %>% sample_n(ifelse(nrow(x) < 500, nrow(x), 500))})
trainingdata <- do.call("rbind", subsamples)
# table(sub_trainingdata$species)

# A stratified random split of the data
idx_train <- createDataPartition(trainingdata$species,
                                 p = 0.65, # percentage of data as training
                                 list = FALSE)

testdata <- trainingdata[-idx_train,]
trainingdata <- trainingdata[idx_train,]
  
obj <- tune(svm, formula, data = trainingdata, kernel="radial",
            ranges = list(gamma = 2^(-8:2), cost = 2^(1:10)), probability = TRUE,
            tunecontrol = tune.control(sampling = "fix", best.model = T))




cf <- caret::confusionMatrix(predict(obj$best.model, newdata=testdata %>% select(-species)),
                             testdata$species)

classifier_plot_svm(table = cf$table,
                    combination.nr = "all")

classifier_assessment_plot(cf = cf, combination.nr = "all")

There are mainly 4 measures:

PPV is the most important for us!

formula <- factor(species) ~ mean_area + sd_area + mean_perimeter + sd_perimeter + mean_major +
        sd_major +  mean_ar + sd_ar +
        mean_turning + sd_turning + gross_disp + max_net + net_disp + net_speed + max_step +
        min_step + sd_step + sd_gross_speed + max_gross_speed + min_gross_speed

set.seed(62)
classifiers_18c_16x <- list()
classifiers_18c_16x_plots <- list()
classifiers_18c_16x_assess_plots <- list()

trainingdata <- dd16_filtered[complete.cases(dd16_filtered), ]

for(i in 1:length(compositions_list)){
  sub_trainingdata <- trainingdata %>%
    filter(species %in% c(compositions_list[[i]]))
  n.var <- length(unique(sub_trainingdata$species))
  sub_trainingdata$species <- factor(sub_trainingdata$species)

  split_up <- split(sub_trainingdata, f = sub_trainingdata$species)
  subsamples <- lapply(split_up, function(x) {
    x %>% sample_n(ifelse(nrow(x) < 500, nrow(x), 500))})
  sub_trainingdata <- do.call("rbind", subsamples)
  # table(sub_trainingdata$species)
  # A stratified random split of the data
  idx_train <- createDataPartition(sub_trainingdata$species,
                                   p = 0.7, # percentage of data as training
                                   list = FALSE)
  
  sub_testdata <- sub_trainingdata[-idx_train,]
  sub_trainingdata <- sub_trainingdata[idx_train,]
  
  n.min <- min(table(sub_trainingdata$species))
  
  obj <- tune(svm, formula, data = sub_trainingdata, kernel="radial",
            ranges = list(gamma = 2^(-8:2), cost = 2^(1:10)), probability = TRUE,
            tunecontrol = tune.control(sampling = "fix", best.model = T))
  
  classifiers_18c_16x[[i]] <- obj$best.model
  
  cf <- caret::confusionMatrix(predict(obj$best.model, newdata=sub_testdata %>% select(-species)),
                       sub_testdata$species)
  
  classifiers_18c_16x_plots[[i]] <- classifier_plot_svm(table = cf$table,
                                                        combination.nr = names(compositions_list)[[i]])
  
  classifiers_18c_16x_assess_plots[[i]] <- classifier_assessment_plot(cf, names(compositions_list)[[i]])
}

names(classifiers_18c_16x_plots) <- names(classifiers_18c_16x) <- 
  names(classifiers_18c_16x_assess_plots) <- comp_name
library(patchwork)
for(i in 1:length(classifiers_18c_16x_plots)){
  print(classifiers_18c_16x_plots[[i]] + classifiers_18c_16x_assess_plots[[i]] + 
    plot_layout(widths = c(4,2)))
}

Save classifiers

saveRDS(classifiers_18c_16x, "svm_video_classifiers_18c_16x_trained_december2021.rds")
# cl <- readRDS("classifiers_18c_16x.rds")